# Record the R version, platform and loaded packages used to render this report.
sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_3.5.2 magrittr_1.5 tools_3.5.2 htmltools_0.3.6
## [5] yaml_2.2.0 Rcpp_1.0.0 stringi_1.2.4 rmarkdown_1.11
## [9] knitr_1.21 stringr_1.3.1 xfun_0.4 digest_0.6.18
## [13] evaluate_0.12
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
## Input attrition file
# Build the full path to the workbook instead of calling setwd(): changing the
# working directory is a global side effect that every later relative path
# would silently depend on.
# NOTE(review): this directory is specific to the original author's machine;
# point `data_dir` at the folder holding CaseStudy2-data.xlsx before running.
data_dir <- path.expand("~/Documents/SMU/Doing DS/Unit 14")
emp <- read.xlsx(file.path(data_dir, "CaseStudy2-data.xlsx"), sheet = 1, startRow = 1, colNames = TRUE)
#emp <- read.xlsx("CaseStudy2-data.xlsx", sheet = 1, startRow = 1, colNames = TRUE, rowNames = FALSE, detectDates = FALSE, skipEmptyRows = TRUE, skipEmptyCols = TRUE, rows = NULL, cols = NULL, check.names = FALSE, namedRegion = NULL, na.strings = "NA", fillMergedCells = FALSE)
# Preview the first rows to confirm the sheet was parsed with the expected columns.
head(emp)
## Age Attrition Attr_Ind BusinessTravel BT1 BT2 DailyRate
## 1 41 Yes 1 Travel_Rarely 1 0 1102
## 2 49 No 0 Travel_Frequently 0 1 279
## 3 37 Yes 1 Travel_Rarely 1 0 1373
## 4 33 No 0 Travel_Frequently 0 1 1392
## 5 27 No 0 Travel_Rarely 1 0 591
## 6 32 No 0 Travel_Frequently 0 1 1005
## Department DistanceFromHome Education EducationField
## 1 Sales 1 2 Life Sciences
## 2 Research & Development 8 1 Life Sciences
## 3 Research & Development 2 2 Other
## 4 Research & Development 3 4 Life Sciences
## 5 Research & Development 2 1 Medical
## 6 Research & Development 2 2 Life Sciences
## EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender HourlyRate
## 1 1 1 2 Female 94
## 2 1 2 3 Male 61
## 3 1 4 4 Male 92
## 4 1 5 4 Female 56
## 5 1 7 1 Male 40
## 6 1 8 4 Male 79
## JobInvolvement JobLevel JobRole JobSatisfaction
## 1 3 2 Sales Executive 4
## 2 2 2 Research Scientist 2
## 3 2 1 Laboratory Technician 3
## 4 3 1 Research Scientist 3
## 5 3 1 Laboratory Technician 2
## 6 3 1 Laboratory Technician 4
## MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked Over18
## 1 Single 5993 19479 8 Y
## 2 Married 5130 24907 1 Y
## 3 Single 2090 2396 6 Y
## 4 Married 2909 23159 1 Y
## 5 Married 3468 16632 9 Y
## 6 Single 3068 11864 0 Y
## OverTime OT_ind PercentSalaryHike PerformanceRating
## 1 Yes 1 11 3
## 2 No 0 23 4
## 3 Yes 1 15 3
## 4 Yes 1 11 3
## 5 No 0 12 3
## 6 No 0 13 3
## RelationshipSatisfaction StandardHours StockOptionLevel
## 1 1 80 0
## 2 4 80 1
## 3 2 80 0
## 4 3 80 0
## 5 4 80 1
## 6 3 80 0
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 1 8 0 1 6
## 2 10 3 3 10
## 3 7 3 3 0
## 4 8 3 3 8
## 5 6 3 3 2
## 6 8 2 2 7
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## 1 4 0 5
## 2 7 1 7
## 3 0 0 0
## 4 7 3 0
## 5 2 2 2
## 6 7 3 6
# Basic data checks
# NOTE(review): this section was headed "Missing values check", but no explicit
# is.na()-based check is run here; only dimensions and summaries are printed.
# (summary() on a numeric vector adds an "NA's" entry when missing values are
# present; none appears in the recorded output for the columns below.)
# Dimensions: number of rows, number of columns
dim(emp)
## [1] 1470 39
# Min / quartiles / median / mean / max for selected numeric variables
summary(emp$MonthlyIncome)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1009 2911 4919 6503 8379 19999
summary(emp$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 30.00 36.00 36.92 43.00 60.00
# as.numeric() coerces the column before summarising -- presumably defensive;
# TODO confirm whether the coercion is actually needed.
summary(as.numeric(emp$YearsAtCompany))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 3.000 5.000 7.008 9.000 40.000
summary(emp$NumCompaniesWorked)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 2.000 2.693 4.000 9.000
summary(emp$DistanceFromHome)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 7.000 9.193 14.000 29.000
summary(emp$YearsInCurrentRole)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 4.229 7.000 18.000
summary(emp$JobSatisfaction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 2.729 4.000 4.000
# Histograms of the numeric variables summarised above, one plot per variable.
# Titles and axis labels are the hist() defaults, derived from the argument
# expression passed to each call.
hist(emp$MonthlyIncome)
hist(emp$Age)
hist(as.numeric(emp$YearsAtCompany))
hist(emp$NumCompaniesWorked)
hist(emp$DistanceFromHome)
hist(emp$YearsInCurrentRole)
# Frequency tables (count, percent, valid percent) for categorical variables.
# freq() is not base R -- presumably questionr::freq, given the n / % / val%
# columns in the recorded output; confirm against the package-loading chunk.
freq(emp$Gender)
## n % val%
## Female 588 40 40
## Male 882 60 60
# Education is stored as a numeric code (1-5 in the recorded output)
freq(emp$Education)
## n % val%
## 1 170 11.6 11.6
## 2 282 19.2 19.2
## 3 572 38.9 38.9
## 4 398 27.1 27.1
## 5 48 3.3 3.3
freq(emp$JobRole)
## n % val%
## Healthcare Representative 131 8.9 8.9
## Human Resources 52 3.5 3.5
## Laboratory Technician 259 17.6 17.6
## Manager 102 6.9 6.9
## Manufacturing Director 145 9.9 9.9
## Research Director 80 5.4 5.4
## Research Scientist 292 19.9 19.9
## Sales Executive 326 22.2 22.2
## Sales Representative 83 5.6 5.6
## Linear model 1: Age as the response
# Ordinary least squares regression of Age on monthly income, number of
# previous companies, tenure and the indicator columns OT_ind, BT1, BT2 and
# Attr_Ind (0/1 columns that are already present in the spreadsheet).
fit <- lm(Age ~ MonthlyIncome + NumCompaniesWorked + YearsAtCompany + OT_ind + BT1 + BT2 + Attr_Ind, data=emp)
# Coefficient table, residual summary and overall fit statistics
summary(fit)
##
## Call:
## lm(formula = Age ~ MonthlyIncome + NumCompaniesWorked + YearsAtCompany +
## OT_ind + BT1 + BT2 + Attr_Ind, data = emp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.3916 -5.4903 -0.9349 4.2522 26.7550
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.9418089 0.7173773 38.950 < 2e-16 ***
## MonthlyIncome 0.0007144 0.0000506 14.119 < 2e-16 ***
## NumCompaniesWorked 0.9776191 0.0818675 11.941 < 2e-16 ***
## YearsAtCompany 0.2082472 0.0384393 5.418 7.06e-08 ***
## OT_ind 1.2025377 0.4501058 2.672 0.00763 **
## BT1 0.3568153 0.6581930 0.542 0.58782
## BT2 0.4648894 0.7681020 0.605 0.54511
## Attr_Ind -2.7124446 0.5644751 -4.805 1.70e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.518 on 1462 degrees of freedom
## Multiple R-squared: 0.326, Adjusted R-squared: 0.3227
## F-statistic: 101 on 7 and 1462 DF, p-value: < 2.2e-16
# Assessing Outliers
# qqPlot(), leveragePlots() and outlierTest() are not base R -- presumably from
# the car package; confirm against the package-loading chunk.
#outlierTest(fit) # Bonferroni p-value for most extreme obs
qqPlot(fit, main="QQ Plot") #qq plot for studentized resid
# The printed value below is what qqPlot() returned -- presumably the row
# indices of the most extreme points it labelled on the plot.
## [1] 939 1355
leveragePlots(fit) # leverage plots
# Residual normality check for `fit`
# Draw the studentized residuals as a density-scaled histogram and overlay the
# standard normal density for a visual comparison.
library(MASS)  # provides studres()
sresid <- studres(fit)
hist(sresid, main = "Distribution of Studentized Residuals", freq = FALSE)
# 40 evenly spaced evaluation points spanning the observed residual range
resid_range <- range(sresid)
xfit <- seq(resid_range[1], resid_range[2], length.out = 40)
yfit <- dnorm(xfit)
lines(x = xfit, y = yfit)
# Evaluate homoscedasticity
# non-constant error variance test (score test against the fitted values, per
# the "Variance formula" line of the recorded output)
ncvTest(fit)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.4922889, Df = 1, p = 0.48291
# Recorded p = 0.48: no evidence of non-constant variance for this model.
# Spread-level plot of the residuals against the fitted values; the call also
# prints a suggested power transformation (see the output that follows).
spreadLevelPlot(fit)
##
## Suggested power transformation: 0.7645897
# Evaluate Collinearity
vif(fit) # variance inflation factors
## MonthlyIncome NumCompaniesWorked YearsAtCompany
## 1.475043 1.086961 1.441393
## OT_ind BT1 BT2
## 1.069121 2.322108 2.346506
## Attr_Ind
## 1.120636
# Rule-of-thumb flag: TRUE marks a predictor whose sqrt(VIF) exceeds 2
sqrt(vif(fit)) > 2 # problem?
## MonthlyIncome NumCompaniesWorked YearsAtCompany
## FALSE FALSE FALSE
## OT_ind BT1 BT2
## FALSE FALSE FALSE
## Attr_Ind
## FALSE
# Evaluate homoscedasticity
# NOTE(review): this repeats the ncvTest()/spreadLevelPlot() calls already run
# on `fit` earlier in this section and reproduces the same output; it looks
# like a copy-paste duplicate and is a candidate for removal.
# non-constant error variance test
ncvTest(fit)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.4922889, Df = 1, p = 0.48291
# spread-level plot of residuals against fitted values
spreadLevelPlot(fit)
##
## Suggested power transformation: 0.7645897
# Evaluate Nonlinearity
# component + residual plot (one panel per predictor in `fit`)
crPlots(fit)
# Ceres plots (alternative nonlinearity diagnostic, left disabled)
#ceresPlots(fit)
# Test for Autocorrelated Errors
# Durbin-Watson test; the recorded output reports the lag-1 autocorrelation,
# the D-W statistic and a p-value for the alternative rho != 0.
durbinWatsonTest(fit)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.01150988 2.02301 0.632
## Alternative hypothesis: rho != 0
# Global test of model assumptions
library(gvlma)
# Wrap the fitted lm in gvlma(); summary() then prints the usual lm summary
# followed by the assumption assessment table (global statistic, skewness,
# kurtosis, link function, heteroscedasticity), as shown in the output below.
gvmodel <- gvlma(fit)
summary(gvmodel)
##
## Call:
## lm(formula = Age ~ MonthlyIncome + NumCompaniesWorked + YearsAtCompany +
## OT_ind + BT1 + BT2 + Attr_Ind, data = emp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.3916 -5.4903 -0.9349 4.2522 26.7550
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.9418089 0.7173773 38.950 < 2e-16 ***
## MonthlyIncome 0.0007144 0.0000506 14.119 < 2e-16 ***
## NumCompaniesWorked 0.9776191 0.0818675 11.941 < 2e-16 ***
## YearsAtCompany 0.2082472 0.0384393 5.418 7.06e-08 ***
## OT_ind 1.2025377 0.4501058 2.672 0.00763 **
## BT1 0.3568153 0.6581930 0.542 0.58782
## BT2 0.4648894 0.7681020 0.605 0.54511
## Attr_Ind -2.7124446 0.5644751 -4.805 1.70e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.518 on 1462 degrees of freedom
## Multiple R-squared: 0.326, Adjusted R-squared: 0.3227
## F-statistic: 101 on 7 and 1462 DF, p-value: < 2.2e-16
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = fit)
##
## Value p-value Decision
## Global Stat 147.935 0.0000000 Assumptions NOT satisfied!
## Skewness 120.341 0.0000000 Assumptions NOT satisfied!
## Kurtosis 8.124 0.0043692 Assumptions NOT satisfied!
## Link Function 17.050 0.0000364 Assumptions NOT satisfied!
## Heteroscedasticity 2.419 0.1198383 Assumptions acceptable.
## Linear model 2: MonthlyIncome as the response
# NOTE(review): despite the object name `fitAttr`, the response here is
# MonthlyIncome; Attr_Ind (the 0/1 attrition indicator) is only one of the
# predictors, alongside Age, NumCompaniesWorked, YearsAtCompany, OT_ind, BT1
# and BT2.
fitAttr <- lm( MonthlyIncome ~ Age + NumCompaniesWorked + YearsAtCompany + OT_ind + BT1 + BT2 + Attr_Ind, data=emp)
# Coefficient table, residual summary and overall fit statistics
summary(fitAttr)
##
## Call:
## lm(formula = MonthlyIncome ~ Age + NumCompaniesWorked + YearsAtCompany +
## OT_ind + BT1 + BT2 + Attr_Ind, data = emp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9913.0 -2282.8 -478.7 1590.7 13260.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2695.11 491.47 -5.484 4.9e-08 ***
## Age 167.94 11.90 14.119 < 2e-16 ***
## NumCompaniesWorked 195.08 41.27 4.727 2.5e-06 ***
## YearsAtCompany 320.99 16.85 19.053 < 2e-16 ***
## OT_ind 193.53 218.71 0.885 0.37635
## BT1 381.18 319.00 1.195 0.23230
## BT2 110.22 372.45 0.296 0.76733
## Attr_Ind -769.36 275.10 -2.797 0.00523 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3645 on 1462 degrees of freedom
## Multiple R-squared: 0.4034, Adjusted R-squared: 0.4005
## F-statistic: 141.2 on 7 and 1462 DF, p-value: < 2.2e-16
# Assessing Outliers
# qqPlot(), leveragePlots() and outlierTest() are not base R -- presumably from
# the car package; confirm against the package-loading chunk.
#outlierTest(fitAttr) # Bonferroni p-value for most extreme obs
qqPlot(fitAttr, main="QQ Plot") #qq plot for studentized resid
# The printed value below is what qqPlot() returned -- presumably the row
# indices of the most extreme points it labelled on the plot.
## [1] 1130 1438
leveragePlots(fitAttr) # leverage plots
# Residual normality check for `fitAttr`
# Draw the studentized residuals as a density-scaled histogram and overlay the
# standard normal density for a visual comparison.
library(MASS)  # provides studres()
sresid <- studres(fitAttr)
hist(sresid, main = "Distribution of Studentized Residuals", freq = FALSE)
# 40 evenly spaced evaluation points spanning the observed residual range
resid_range <- range(sresid)
xfit <- seq(resid_range[1], resid_range[2], length.out = 40)
yfit <- dnorm(xfit)
lines(x = xfit, y = yfit)
# Evaluate homoscedasticity
# non-constant error variance test (score test against the fitted values, per
# the "Variance formula" line of the recorded output)
ncvTest(fitAttr)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 237.7393, Df = 1, p = < 2.22e-16
# Recorded p < 2.22e-16: constant variance is rejected for this model.
# Spread-level plot of the residuals against the fitted values; the call also
# prints a suggested power transformation (see the output that follows).
spreadLevelPlot(fitAttr)
## Warning in spreadLevelPlot.lm(fitAttr):
## 2 negative fitted values removed
##
## Suggested power transformation: 0.07475701
# Evaluate Collinearity
vif(fitAttr) # variance inflation factors
## Age NumCompaniesWorked YearsAtCompany
## 1.305565 1.175022 1.177871
## OT_ind BT1 BT2
## 1.073766 2.320309 2.346954
## Attr_Ind
## 1.132278
# Rule-of-thumb flag: TRUE marks a predictor whose sqrt(VIF) exceeds 2
sqrt(vif(fitAttr)) > 2 # problem?
## Age NumCompaniesWorked YearsAtCompany
## FALSE FALSE FALSE
## OT_ind BT1 BT2
## FALSE FALSE FALSE
## Attr_Ind
## FALSE
# Evaluate homoscedasticity
# NOTE(review): this repeats the ncvTest()/spreadLevelPlot() calls already run
# on `fitAttr` earlier in this section and reproduces the same output; it looks
# like a copy-paste duplicate and is a candidate for removal.
# non-constant error variance test
ncvTest(fitAttr)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 237.7393, Df = 1, p = < 2.22e-16
# spread-level plot of residuals against fitted values
spreadLevelPlot(fitAttr)
## Warning in spreadLevelPlot.lm(fitAttr):
## 2 negative fitted values removed
##
## Suggested power transformation: 0.07475701
# Evaluate Nonlinearity
# component + residual plot (one panel per predictor in `fitAttr`)
crPlots(fitAttr)
# Ceres plots (alternative nonlinearity diagnostic, left disabled)
#ceresPlots(fitAttr)
# Test for Autocorrelated Errors
# Durbin-Watson test; the recorded output reports the lag-1 autocorrelation,
# the D-W statistic and a p-value for the alternative rho != 0.
durbinWatsonTest(fitAttr)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.01872124 2.037305 0.458
## Alternative hypothesis: rho != 0
# Global test of model assumptions
library(gvlma)
# Wrap the fitted lm in gvlma(); summary() then prints the usual lm summary
# followed by the assumption assessment table. This overwrites the `gvmodel`
# object created for the first model.
gvmodel <- gvlma(fitAttr)
summary(gvmodel)
##
## Call:
## lm(formula = MonthlyIncome ~ Age + NumCompaniesWorked + YearsAtCompany +
## OT_ind + BT1 + BT2 + Attr_Ind, data = emp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9913.0 -2282.8 -478.7 1590.7 13260.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2695.11 491.47 -5.484 4.9e-08 ***
## Age 167.94 11.90 14.119 < 2e-16 ***
## NumCompaniesWorked 195.08 41.27 4.727 2.5e-06 ***
## YearsAtCompany 320.99 16.85 19.053 < 2e-16 ***
## OT_ind 193.53 218.71 0.885 0.37635
## BT1 381.18 319.00 1.195 0.23230
## BT2 110.22 372.45 0.296 0.76733
## Attr_Ind -769.36 275.10 -2.797 0.00523 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3645 on 1462 degrees of freedom
## Multiple R-squared: 0.4034, Adjusted R-squared: 0.4005
## F-statistic: 141.2 on 7 and 1462 DF, p-value: < 2.2e-16
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = fitAttr)
##
## Value p-value Decision
## Global Stat 307.4495 0.00000 Assumptions NOT satisfied!
## Skewness 195.9303 0.00000 Assumptions NOT satisfied!
## Kurtosis 106.3839 0.00000 Assumptions NOT satisfied!
## Link Function 4.5831 0.03229 Assumptions NOT satisfied!
## Heteroscedasticity 0.5522 0.45741 Assumptions acceptable.
library(tidyr) # data tidying (e.g., spread)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
##
## complete
## The following object is masked from 'package:pastecs':
##
## extract
## The following object is masked from 'package:magrittr':
##
## extract
# explore status/terminations by Department
#status_count <- with(emp, table(Department, Attrition))
#status_count <- spread(data.frame(status_count), Attrition, Freq)
#status_count$previous_active <- shift(status_count$ACTIVE, 1L, type = "lag")
#status_count$percent_terminated <- 100*status_count$Yes / status_count$previous_active
#status_count
# chart data in questions
#ggplot() + geom_bar(aes(y = ..count..,x = STATUS_YEAR, fill = termreason_desc), data=terms, position = position_stack()) +
# theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# "Age","BusinessTravel","Department","DistanceFromHome","Education","EducationField", "Gender", "JobInvolvement", "JobLevel", "JobRole", "JobSatisfaction", "MaritalStatus", "MonthlyIncome", "NumCompaniesWorked", "Overtime", "PercentSalaryHike","PerformanceRating", "RelationshipSatisfaction", "StockOptionLevel", "TotalWorkingYears","TrainingTimesLastYear", "WorkLifeBalance","YearsAtCompany","YearsInCurrentRole","YearsSinceLastPromotion","YearsWithCurrManager","Attrition"
# Partition the data into training and test sets
# Columns handed to the decision tree: the predictors plus the "Attrition"
# response (last entry). Each name must appear only once: selecting a data
# frame column twice produces an extra, renamed copy (e.g. "MonthlyIncome.1")
# that would enter the model as a redundant predictor. "MonthlyIncome" was
# previously listed twice; the second occurrence has been dropped and the
# order of the remaining names is unchanged.
term_vars <- c(
  "Age", "Education", "MonthlyIncome", "BusinessTravel", "Department",
  "DistanceFromHome", "EducationField", "Gender", "JobInvolvement",
  "JobLevel", "JobSatisfaction", "MaritalStatus", "NumCompaniesWorked",
  "OverTime", "PercentSalaryHike", "PerformanceRating", "StockOptionLevel",
  "TotalWorkingYears", "TrainingTimesLastYear", "WorkLifeBalance",
  "RelationshipSatisfaction", "YearsAtCompany", "YearsInCurrentRole",
  "YearsSinceLastPromotion", "YearsWithCurrManager", "Attrition"
)
# Training rows are those with Education below 4; all other rows with a
# non-missing Education value form the test set.
# NOTE(review): this split is deterministic rather than random, and Education
# is itself one of the columns in term_vars -- confirm this is intended.
emp_term_train <- subset(emp, subset = Education < 4)
emp_term_test <- subset(emp, subset = Education >= 4)

# Fix the random seed so that results are repeatable
set.seed(99)

# Decision tree model: classification tree for Attrition on every other
# column in term_vars, splitting on information gain, with surrogate splits
# disabled.
rpart_model <- rpart(
  Attrition ~ .,
  data = emp_term_train[term_vars],
  method = "class",
  parms = list(split = "information"),
  control = rpart.control(usesurrogate = 0, maxsurrogate = 0)
)

# Plot the decision tree
rpart.plot(rpart_model, type = 3, roundint = FALSE)
##
# plot terminated & active by age
# `resigned` is Attrition converted to a factor (levels No / Yes, as shown by
# the recorded summary below); it is the grouping variable for the plots.
emp$resigned <- as.factor(emp$Attrition)
summary(emp$resigned)
## No Yes
## 1233 237
featurePlot(x=emp$Age, y=emp$resigned, plot="density",
auto.key = list(columns = 2), labels = c("Age (years)", ""))
##
# plot terminated & active by MonthlyIncome
featurePlot(x=emp$MonthlyIncome, y=emp$resigned, plot="density",
auto.key = list(columns = 2), labels = c("Monthly Income", ""))
##
# plot terminated & active by YearsAtCompany
# (axis label corrected: it previously read "Monthly Income", copied from the
# plot above, although the variable shown is YearsAtCompany)
featurePlot(x=emp$YearsAtCompany, y=emp$resigned, plot="density",
auto.key = list(columns = 2), labels = c("Years at Company", ""))
# Subset the data
# Build a modelling frame from selected numeric columns, the `resigned` factor
# and EmployeeNumber (the ID is used later to label the test-set predictions).
emp_cat <- data.frame(emp$Age,emp$Education,emp$MonthlyIncome,emp$DistanceFromHome,emp$JobInvolvement,emp$JobLevel, emp$JobSatisfaction, emp$NumCompaniesWorked, emp$PercentSalaryHike,emp$PerformanceRating, emp$StockOptionLevel, emp$TotalWorkingYears,emp$TrainingTimesLastYear, emp$WorkLifeBalance,emp$RelationshipSatisfaction,emp$YearsAtCompany,emp$YearsInCurrentRole,emp$resigned,emp$EmployeeNumber)
# Plain column names, in the same order as the columns passed to data.frame()
col_headings <- c("Age","Education","MonthlyIncome","DistanceFromHome","JobInvolvement","JobLevel", "JobSatisfaction", "NumCompaniesWorked", "PercentSalaryHike","PerformanceRating", "StockOptionLevel", "TotalWorkingYears","TrainingTimesLastYear", "WorkLifeBalance","RelationshipSatisfaction","YearsAtCompany","YearsInCurrentRole","resigned","EmployeeNumber")
names(emp_cat) <- col_headings
# Education-based split: rows with Education < 4 train, rows with >= 4 test
emp_train <- subset(emp_cat, Education < 4)
emp_test <- subset(emp_cat, Education >= 4)
# Resample the training data with ROSE() (presumably from the ROSE package) to
# balance the two classes of `resigned`; the test set is left untouched.
# NOTE(review): emp_train still contains EmployeeNumber, so `resigned ~ .`
# includes the ID column at this step -- confirm that is intended.
emp_train_rose <- ROSE(resigned ~ ., data = emp_train, seed=125)$data
# Tables to show balanced dataset sample sizes
table(emp_train_rose$resigned)
##
## No Yes
## 521 503
# Columns used by the random forest: the predictors plus the `resigned`
# response (EmployeeNumber is not in this list).
res_vars <- c(
  "Age", "Education", "MonthlyIncome", "DistanceFromHome", "JobInvolvement",
  "JobLevel", "JobSatisfaction", "NumCompaniesWorked", "PercentSalaryHike",
  "PerformanceRating", "StockOptionLevel", "TotalWorkingYears",
  "TrainingTimesLastYear", "WorkLifeBalance", "RelationshipSatisfaction",
  "YearsAtCompany", "YearsInCurrentRole", "resigned"
)

# Seed the RNG immediately before fitting so the forest is reproducible
set.seed(222)
emp_res_rose_RF <- randomForest(
  resigned ~ .,
  data = emp_train_rose[res_vars],
  ntree = 500,
  importance = TRUE,
  na.action = na.omit
)

# Variable importance plot (type = 1)
varImpPlot(
  emp_res_rose_RF,
  type = 1,
  main = "Variable Importance (Accuracy)",
  sub = "Random Forest Model"
)
#var_importance <- importance(emp_res_rose_RF)
#emp_res_rose_RF # view results & Confusion matrix

# Predict classes for the held-out test data ("emp_test") and compare them
# with the observed labels, treating "Yes" as the positive class and
# reporting precision / recall.
emp_res_rose_RF_pred <- predict(emp_res_rose_RF, newdata = emp_test)
confusionMatrix(
  data = emp_res_rose_RF_pred,
  reference = emp_test$resigned,
  positive = "Yes",
  mode = "prec_recall"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 255 32
## Yes 128 31
##
## Accuracy : 0.6413
## 95% CI : (0.5948, 0.6858)
## No Information Rate : 0.8587
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0965
## Mcnemar's Test P-Value : 5.894e-14
##
## Precision : 0.19497
## Recall : 0.49206
## F1 : 0.27928
## Prevalence : 0.14126
## Detection Rate : 0.06951
## Detection Prevalence : 0.35650
## Balanced Accuracy : 0.57893
##
## 'Positive' Class : Yes
##
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
# Calculate prediction probabilities of employees who will resign
# type="prob" requests class probabilities instead of predicted labels; the
# recorded output shows one column per class (No / Yes).
emp_res_rose_RF_pred_probs <- predict(emp_res_rose_RF, emp_test, type="prob")
# Put the employee IDs next to their predicted probabilities
Employees_flight_risk <- as.data.frame(cbind(emp_test$EmployeeNumber,emp_res_rose_RF_pred_probs))
# NOTE: this overwrites the `col_headings` vector defined earlier in the script
col_headings <- c("EmployeeNumber","No","Yes")
names(Employees_flight_risk) <- col_headings
# Sort by descending probability of "Yes" so the highest-risk employees come
# first (arrange()/desc() are presumably from dplyr -- confirm in setup chunk)
Employees_flight_risk <- arrange(Employees_flight_risk, desc(Yes))
# Highest predicted flight risk
head(Employees_flight_risk)
## EmployeeNumber No Yes
## 1 565 0.090 0.910
## 2 939 0.116 0.884
## 3 2010 0.188 0.812
## 4 1427 0.192 0.808
## 5 648 0.202 0.798
## 6 1842 0.202 0.798
# Lowest predicted flight risk
tail(Employees_flight_risk)
## EmployeeNumber No Yes
## 441 1630 0.732 0.268
## 442 1061 0.740 0.260
## 443 1932 0.744 0.256
## 444 1225 0.754 0.246
## 445 323 0.760 0.240
## 446 1116 0.810 0.190